import sys
import csv
from unidecode import unidecode
import re
import unicodedata as ud
import numpy as np


def remove_non_ascii(text):
    """Transliterate *text* to an ASCII approximation using ``unidecode``.

    Args:
        text: A UTF-8 encoded byte string, or an already-decoded text string.

    Returns:
        Whatever ``unidecode`` returns for the decoded text.

    Raises:
        UnicodeDecodeError: If *text* is a byte string that is not valid UTF-8.
    """
    # Only byte strings need decoding.  The previous unconditional
    # ``unicode(text, encoding=...)`` raised TypeError for input that was
    # already unicode and relied on a builtin that exists only on Python 2.
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    return unidecode(text)

def _extract_last_name(raw_name):
    """Return the upper-cased last-name token of *raw_name*, or None.

    The name is transliterated to ASCII, every non-word character is turned
    into a space, and the result is split on single spaces (so consecutive
    separators yield empty tokens).  With more than two tokens the final
    token is used unless it is shorter than two characters, in which case
    the one before it is used.  With exactly two tokens the second is used.
    With fewer than two tokens there is no usable last name and None is
    returned.
    """
    ascii_name = re.sub(r'[^\x00-\x7F]+', '', remove_non_ascii(raw_name))
    tokens = re.sub(r'[^\w]', ' ', ascii_name).strip().split(' ')
    if len(tokens) < 2:
        return None
    if len(tokens) > 2 and ('.' in tokens[-1] or len(tokens[-1]) < 2):
        chosen = tokens[-2]
    else:
        chosen = tokens[-1]
    return chosen.replace('.', '').upper()


def main():
    """Append per-user ethnicity probabilities to the Twitter user file.

    Reads the county table via ``bisg_county()``, the surname table from
    ``last_name_db.csv`` and the tab-separated user rows from
    ``Twitter_user_info_file``.  For every user row whose last column is a
    key of the county table, the row is written to
    ``Twitter_users_eth_all.csv`` followed by a tab and four ``&``-joined
    probabilities.  Rows whose last column is not in the county table are
    not written at all.

    The four probabilities are, in order, what the helper functions call
    ``pt_w``, ``pt_b``, ``pt_h`` and ``pt_o`` (presumably white / black /
    Hispanic / other -- confirm against the data dictionary).

    The user file is read and written as byte strings ("rb"/"wb"), as in the
    original Python 2 script.
    """
    county = bisg_county()

    # Index the surname table once instead of scanning it for every user.
    # setdefault keeps the FIRST row for a surname, matching a top-down scan
    # that stops at the first hit.
    surname_rows = {}
    with open("last_name_db.csv", 'rt') as f:
        for row in csv.reader(f, delimiter=','):
            if row:  # csv.reader yields [] for blank lines
                surname_rows.setdefault(row[0], row)

    # Fixed answer for Spanish-language accounts: all mass on the third slot.
    spanish_probs = ["0.00", "0.00", "1.00", "0.00"]

    ## Sample input row (tab separated):
    ## 1	en	screen_name	name	-1&-1	male	41.84	52.16	1.95	2336
    # NOTE(review): going by the sample above, column index 2 would be the
    # screen name and index 3 the display name, yet index 2 is what gets
    # parsed as the person's name below -- confirm which one is intended.
    with open("Twitter_users_eth_all.csv", "wb") as mout, \
            open("Twitter_user_info_file", "rb") as mFile:
        for mRow in mFile:
            data = mRow.strip().split('\t')
            if len(data) < 3:
                # Blank or truncated line: there is no name column to read.
                continue

            area = county.get(data[-1])
            if area is None:
                # Unknown polygon/county ID: the row is dropped from the output.
                continue

            if data[1] == 'es':
                probs = spanish_probs
            else:
                last_name = _extract_last_name(data[2])
                surname_row = None
                if last_name is not None:
                    print(last_name)
                    surname_row = surname_rows.get(last_name)

                probs = None
                if surname_row is not None:
                    name = [float(surname_row[5]), float(surname_row[6]),
                            float(surname_row[-2]), float(surname_row[-1])]
                    # A surname whose probabilities are all zero carries no
                    # information, so it is treated like an unknown surname.
                    if max(name) > 0.0:
                        probs = county_prob(name, area)
                if probs is None:
                    probs = county_prob_nomatch(area)

            mout.write(mRow.rstrip() + "\t" + "&".join(probs) + "\n")
    print('program finished')

def county_prob(name, county):
    """Combine surname and county values into four normalised shares.

    Each of the first four entries of *name* is multiplied by the entry of
    *county* at the same position; every product is then divided by the sum
    of the four products.

    Args:
        name: Sequence with at least four numeric entries.
        county: Sequence with at least four numeric entries.

    Returns:
        A list of four strings, the ``str()`` of each share, in input order.

    Raises:
        ZeroDivisionError: If the four products sum to zero.
    """
    joint = [name[idx] * county[idx] for idx in range(4)]
    # Added left to right so the floating-point result matches a plain
    # a + b + c + d expression exactly.
    total = joint[0] + joint[1] + joint[2] + joint[3]
    return [str(part / total) for part in joint]

def county_prob_nomatch(county):
    """Compute the four shares for a user with no usable surname data.

    A fixed set of four country-wide demographic values stands in for the
    surname values.  Each is multiplied by the entry of *county* at the same
    position and divided by the sum of the four products.

    Args:
        county: Sequence with at least four numeric entries.

    Returns:
        A list of four strings, the ``str()`` of each share, in order.

    Raises:
        ZeroDivisionError: If the four products sum to zero.
    """
    ## country-wide demographics probability
    nationwide = (62.11472976, 12.40544552, 17.32957983, 8.150244887)
    weighted = [nationwide[idx] * county[idx] for idx in range(4)]
    # Added left to right so the floating-point result matches a plain
    # a + b + c + d expression exactly.
    denominator = weighted[0] + weighted[1] + weighted[2] + weighted[3]
    return [str(term / denominator) for term in weighted]

def bisg_county():
    """Load the per-county values from ``eth_values.csv``.

    The file is read as tab-separated text.  For each line the first column
    is the key, and the value is a list of four floats taken from columns
    1, 2, 3 and the LAST column of the line.

    Returns:
        dict mapping the first-column string to its list of four floats.
        When a key appears more than once the last line wins.

    Raises:
        IndexError: If a non-blank line has fewer than four columns.
        ValueError: If one of the selected columns is not a number.
    """
    counties = {}
    # The context manager closes the handle even if a line fails to parse.
    with open("eth_values.csv", "r") as mFile:
        for line in mFile:
            if not line.strip():
                # A blank line (e.g. trailing newline) has no columns to read.
                continue
            data = line.rstrip().split("\t")
            counties[data[0]] = [
                float(data[1]),
                float(data[2]),
                float(data[3]),
                float(data[-1]),
            ]
    return counties

def ethinicity_county_stat():
    """Write the per-county mean of the first three probabilities.

    Reads ``Twitter_users_eth_all.csv`` (tab separated).  In each line the
    second-to-last column is used as the county key and the last column is
    split on ``&``; only its first three values are used.  For every county
    the values are multiplied by 100 and averaged column-wise, and one line
    ``<county>\\t<mean0>\\t<mean1>\\t<mean2>`` is written to
    ``ethnicity_county_results.csv``.  Each county key is also printed.

    Raises:
        IndexError: If a non-blank line has too few columns or values.
        ValueError: If one of the values is not a number.
    """
    result = {}
    # Context managers close both files even when a line fails to parse.
    with open("Twitter_users_eth_all.csv", "r") as mFile, \
            open("ethnicity_county_results.csv", "w") as oFile:
        for line in mFile:
            if not line.strip():
                continue  # blank line: nothing to aggregate
            data = line.rstrip().split("\t")
            eth = data[-1].split("&")
            ethV = [float(eth[0]), float(eth[1]), float(eth[2])]
            result.setdefault(data[-2], []).append(ethV)

        for key, value in result.items():
            print(key)
            # Named 'means' so the module-level 're' import is not shadowed.
            means = np.mean(np.array(value) * 100.0, axis=0)
            oFile.write(str(key) + "\t" + "\t".join(map(str, means.tolist())) + "\n")

def national():
    """Collect the first three probabilities of every user row.

    Reads ``Twitter_users_eth_all.csv`` (tab separated), splits the last
    column of each line on ``&`` and gathers its first three values into
    three NumPy arrays.  ``national_results.csv`` is opened for writing, so
    it is created or truncated, but nothing is written to it and the
    function returns None.

    NOTE(review): the arrays are built and then discarded; the aggregation
    and the write to ``national_results.csv`` appear to be unfinished.

    Raises:
        IndexError: If a non-blank line carries fewer than three values.
        ValueError: If one of the values is not a number.
    """
    w = []
    b = []
    h = []
    # Context managers make sure both handles are closed; previously neither
    # file was ever closed.
    with open("Twitter_users_eth_all.csv", "r") as mFile, \
            open("national_results.csv", "w"):
        for line in mFile:
            if not line.strip():
                continue  # blank line: nothing to collect
            eth = line.rstrip().split("\t")[-1].split("&")
            w.append(float(eth[0]))
            b.append(float(eth[1]))
            h.append(float(eth[2]))

    # TODO(review): compute and persist the national figures from these.
    ww = np.array(w)
    bb = np.array(b)
    hh = np.array(h)

if __name__ == '__main__':
    # Pipeline: annotate every user row, then aggregate the annotated file
    # per county and nationally.  main() loads the county table itself, so
    # the former stand-alone bisg_county() call (whose result was discarded)
    # is not needed.
    print("program running")
    main()
    ethinicity_county_stat()
    national()